#!/usr/bin/env python
# encoding: utf-8
# @Time : 2020/7/26 11:42 
# @Author : 能量咖啡豆 
# @File : c08exp.py 
# @desc : c08exp.py 
import numpy as np
import pandas as pd


def to_cat_list(catstr):
    """Split a comma-separated category string into a list of entries.

    Each piece is stripped of surrounding whitespace; pieces that are empty
    after stripping are dropped. The original order is preserved.
    """
    entries = []
    for piece in catstr.split(','):
        cleaned = piece.strip()
        if cleaned:
            entries.append(cleaned)
    return entries

def get_all_categories(cat_series):
    """Return the sorted list of distinct categories found in ``cat_series``.

    Each element of ``cat_series`` is a comma-separated category string that
    is parsed with ``to_cat_list``. An empty ``cat_series`` yields an empty
    list.
    """
    # Accumulate into an initially empty set rather than calling
    # ``set.union(*sets)``, which raises TypeError when there are no sets.
    unique_cats = set()
    for entry in cat_series:
        unique_cats.update(to_cat_list(entry))
    return sorted(unique_cats)

def get_english(cat):
    """Split a category label into ``(code, name)``.

    ``cat`` is expected to contain exactly one ``'.'`` separating the code
    from the name part; otherwise the tuple unpacking raises ``ValueError``.
    When the name part contains ``'|'``, the segment after the first
    ``' | '`` separator is used as the name (presumably the English
    translation, going by the function name). The returned name is stripped
    of surrounding whitespace.

    Returns:
        A ``(code, name)`` tuple of strings.
    """
    code, names = cat.split('.')
    if '|' in names:
        names = names.split(' | ')[1]
    # Return for every input, not only for labels that contain a '|'.
    return code, names.strip()

def get_code(seq):
    """Return the text before the first ``'.'`` of each non-empty item in ``seq``.

    Falsy items (for example empty strings) are skipped.
    """
    codes = []
    for label in seq:
        if not label:
            continue
        prefix = label.split('.')[0]
        codes.append(prefix)
    return codes

if __name__ == "__main__":
    print("c08 海地地震危机数据")
    data = pd.read_csv('data/Haiti.csv')

    # Keep only rows inside the latitude/longitude bounding box that also
    # have a category value.
    lat_in_range = (data.LATITUDE > 18) & (data.LATITUDE < 20)
    lon_in_range = (data.LONGITUDE > -75) & (data.LONGITUDE < -70)
    has_category = data.CATEGORY.notnull()
    data = data[lat_in_range & lon_in_range & has_category]

    # Collect the distinct category codes that appear in the filtered rows.
    all_cats = get_all_categories(data.CATEGORY)
    all_codes = get_code(all_cats)
    code_index = pd.Index(np.unique(all_codes))

    # One indicator column per code, initialised to zero for every row.
    zero_matrix = np.zeros((len(data), len(code_index)))
    dummy_frame = pd.DataFrame(
        zero_matrix,
        index=data.index,
        columns=code_index,
    )

    # Set the indicator for every code listed in each row's category string.
    for row_label, cat_string in zip(data.index, data.CATEGORY):
        row_codes = get_code(to_cat_list(cat_string))
        dummy_frame.loc[row_label, row_codes] = 1

    prefixed_dummies = dummy_frame.add_prefix('category_')
    data = data.join(prefixed_dummies)

    # print(data)